Code
library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese"),
Religion=relevel(Religion,ref="None")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |>
mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
`Primary Language` = as.factor(`Primary Language`))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
We will be using the interpretation set to run an analysis of deviance to check model performance.
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
imbal <- ROSE::ROSE(Unmet.Health.Need~.,
data=rfdata,
seed=3)$data
# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Unmet.Health.Need~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Unmet.Health.Need~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Unmet.Health.Need ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 9.8 secs
VSURF selected:
18 variables at thresholding step (in 4.6 secs)
6 variables at interpretation step (in 2.7 secs)
1 variables at prediction step (in 2.5 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "English.Speaking"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "English.Speaking" "Duration.of.Residency" "Age"
[4] "Ethnicity" "Discrimination" "English.Difficulties"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1033941
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 English.Speaking 0.01272 0.00044 black
2 Duration.of.Residency 0.01109 0.00060 black
3 Age 0.00999 0.00051 black
4 Ethnicity 0.00871 0.00049 red
5 Discrimination 0.00748 0.00052 black
6 English.Difficulties 0.00654 0.00035 black
7 Dental.Insurance 0.00629 0.00044 black
8 Religion 0.00515 0.00055 black
9 Full.Time.Employment 0.00275 0.00027 black
10 Belonging 0.00236 0.00046 black
11 Income_median 0.00222 0.00031 black
12 Health.Insurance 0.00210 0.00031 black
13 Identify.Ethnically 0.00196 0.00029 black
14 Primary.Language 0.00186 0.00026 black
15 Familiarity.with.America 0.00173 0.00034 black
16 US.Born 0.00144 0.00015 black
17 Gender 0.00065 0.00023 black
18 Familiarity.with.Ethnic.Origin 0.00060 0.00030 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", width=6, height=4.5,units="in")all_formula <- Unmet.Health.Need~.
pred_vars <- names(rfdata[,-1])[vsurf.mod$varselect.interp]
pred_vars <- ifelse("Ethnicity" %in% pred_vars, pred_vars,c(pred_vars,"Ethnicity"))
mod_form <- reformulate(pred_vars, response="Unmet.Health.Need")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Health.Need")
options(contrasts = c("contr.treatment","contr.poly"))
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Health.Need ~ English.Speaking
Model 2: Unmet.Health.Need ~ English.Speaking
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1970 1243.3
2 1970 1243.3 0 0
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1273.609 1273.609
car::Anova(mod1, test.statistic = "F")Analysis of Deviance Table (Type II tests)
Response: Unmet.Health.Need
Error estimate based on Pearson residuals
Sum Sq Df F value Pr(>F)
English.Speaking 68.93 3 22.93 1.37e-14 ***
Residuals 1974.00 1970
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.1550 0.2391 -4.830 1.36e-06 ***
English.SpeakingNot well -0.3543 0.2672 -1.326 0.185
English.SpeakingVery well -1.7448 0.2879 -6.060 1.36e-09 ***
English.SpeakingWell -1.1893 0.2783 -4.273 1.93e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1312.2 on 1973 degrees of freedom
Residual deviance: 1243.3 on 1970 degrees of freedom
AIC: 1251.3
Number of Fisher Scoring iterations: 5
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
# imbal <- ROSE::ROSE(Unmet.Health.Need~.,
# data=rfdata,
# seed=3)$data
# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Unmet.Health.Need~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Unmet.Dental.Needs~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Unmet.Dental.Needs ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 7.5 secs
VSURF selected:
17 variables at thresholding step (in 4.6 secs)
1 variables at interpretation step (in 2.5 secs)
1 variables at prediction step (in 0.4 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1146596
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.01348 0.00064 black
2 Age 0.01004 0.00060 black
3 English.Speaking 0.00917 0.00067 black
4 Duration.of.Residency 0.00865 0.00065 black
5 Religion 0.00721 0.00048 black
6 Income_median 0.00542 0.00039 black
7 Ethnicity 0.00404 0.00048 red
8 Familiarity.with.America 0.00361 0.00041 black
9 Primary.Language 0.00345 0.00030 black
10 English.Difficulties 0.00318 0.00058 black
11 Full.Time.Employment 0.00196 0.00033 black
12 US.Born 0.00153 0.00016 black
13 Familiarity.with.Ethnic.Origin 0.00152 0.00034 black
14 Health.Insurance 0.00138 0.00033 black
15 Discrimination 0.00092 0.00031 black
16 Identify.Ethnically 0.00074 0.00031 black
17 Belonging 0.00064 0.00026 black
18 Gender -0.00008 0.00024 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")Dental Insurance is the only variable in the interpretation set, which means Ethnicity might not be a significant predictor of unmet dental needs.
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Unmet.Dental.Needs")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Dental.Needs")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1966 1300.0
2 1961 1291.3 5 8.7216 0.1207
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1344.41 1315.208
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.53967 0.16386 -9.396 <2e-16 ***
Dental.InsuranceYes -1.39569 0.15702 -8.889 <2e-16 ***
EthnicityAsian Indian -0.18551 0.24169 -0.768 0.4427
EthnicityFilipino 0.30009 0.28968 1.036 0.3002
EthnicityKorean 0.35927 0.21118 1.701 0.0889 .
EthnicityOther -0.05362 0.38835 -0.138 0.8902
EthnicityVietnamese 0.33570 0.21855 1.536 0.1245
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1395.0 on 1967 degrees of freedom
Residual deviance: 1291.3 on 1961 degrees of freedom
AIC: 1305.3
Number of Fisher Scoring iterations: 5
The non-significant analysis of deviance implies that Ethnicity might not be a significant predictor of unmet dental needs.
rfdata <- qol |>
select(`Physical Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
# imbal <- ROSE::ROSE(Physical.Check.up~.,
# data=rfdata,
# seed=3)$data
#
# VSURF(Physical.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 12.9 secs
VSURF selected:
17 variables at thresholding step (in 6.3 secs)
5 variables at interpretation step (in 3.4 secs)
2 variables at prediction step (in 3.2 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency" "Health.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency" "Age" "Health.Insurance"
[4] "Dental.Insurance" "Income_median"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.2817396
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.02978 0.00085 black
2 Age 0.02682 0.00089 black
3 Health.Insurance 0.02257 0.00060 black
4 Dental.Insurance 0.01358 0.00068 black
5 Income_median 0.00707 0.00048 black
6 Ethnicity 0.00670 0.00057 red
7 EnglishSpeak 0.00567 0.00045 black
8 EnglishDiff 0.00472 0.00067 black
9 Employment 0.00456 0.00030 black
10 Religion 0.00413 0.00050 black
11 Familiarity.with.America 0.00268 0.00035 black
12 Gender 0.00247 0.00040 black
13 Familiarity.with.Ethnic.Origin 0.00223 0.00043 black
14 Primary.Language 0.00116 0.00028 black
15 US.Born 0.00093 0.00028 black
16 Discrimination 0.00064 0.00035 black
17 Identify.Ethnically 0.00047 0.00050 black
18 Belonging 0.00020 0.00040 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_PC.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Physical.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Physical.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance +
Dental.Insurance + Income_median
Model 2: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance +
Dental.Insurance + Income_median + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1960 2152.4
2 1955 2115.6 5 36.766 6.673e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 2199.048 2197.895
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.002804 0.225453 -8.883 < 2e-16 ***
Duration.of.Residency 0.032732 0.005457 5.998 2.00e-09 ***
Age 0.019095 0.003837 4.976 6.48e-07 ***
Health.InsuranceYes 1.190926 0.161703 7.365 1.77e-13 ***
Dental.InsuranceYes 0.544998 0.126932 4.294 1.76e-05 ***
Income_medianAbove 0.370270 0.119702 3.093 0.00198 **
EthnicityAsian Indian 0.097049 0.159050 0.610 0.54174
EthnicityFilipino 0.435205 0.216839 2.007 0.04474 *
EthnicityKorean -0.486941 0.157210 -3.097 0.00195 **
EthnicityOther -0.259744 0.248770 -1.044 0.29643
EthnicityVietnamese 0.438791 0.173259 2.533 0.01132 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2466.2 on 1965 degrees of freedom
Residual deviance: 2115.6 on 1955 degrees of freedom
AIC: 2137.6
Number of Fisher Scoring iterations: 4
rfdata <- qol |>
select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
# imbal <- ROSE::ROSE(Physical.Check.up~.,
# data=rfdata,
# seed=3)$data
#
# VSURF(Physical.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Dentist.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 10 secs
VSURF selected:
18 variables at thresholding step (in 5.5 secs)
3 variables at interpretation step (in 3.1 secs)
3 variables at prediction step (in 1.4 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance" "Duration.of.Residency" "Religion"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance" "Duration.of.Residency" "Religion"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.2958695
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.05153 0.00118 black
2 Duration.of.Residency 0.04922 0.00089 black
3 Religion 0.01600 0.00076 black
4 Ethnicity 0.01322 0.00082 red
5 Age 0.01149 0.00064 black
6 Health.Insurance 0.00828 0.00042 black
7 EnglishSpeak 0.00781 0.00060 black
8 Income_median 0.00704 0.00055 black
9 Identify.Ethnically 0.00395 0.00053 black
10 EnglishDiff 0.00328 0.00062 black
11 Familiarity.with.Ethnic.Origin 0.00287 0.00047 black
12 Familiarity.with.America 0.00260 0.00039 black
13 Belonging 0.00222 0.00055 black
14 Gender 0.00218 0.00046 black
15 Employment 0.00212 0.00020 black
16 US.Born 0.00177 0.00021 black
17 Primary.Language 0.00172 0.00035 black
18 Discrimination 0.00087 0.00024 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency +
Religion
Model 2: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency +
Religion + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1952 2209.7
2 1947 2201.0 5 8.7418 0.1198
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 2307.098 2277.933
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.975528 0.152696 -6.389 1.67e-10 ***
Dental.InsuranceYes 1.609345 0.108703 14.805 < 2e-16 ***
Duration.of.Residency 0.045686 0.004692 9.738 < 2e-16 ***
ReligionBuddhist 0.246415 0.205398 1.200 0.2303
ReligionCatholic 0.126604 0.202056 0.627 0.5309
ReligionHindu -0.292057 0.326191 -0.895 0.3706
ReligionMuslim -0.251392 0.402636 -0.624 0.5324
ReligionOther -0.522105 0.423661 -1.232 0.2178
ReligionProtestant 0.003237 0.170527 0.019 0.9849
EthnicityAsian Indian -0.498288 0.312440 -1.595 0.1108
EthnicityFilipino -0.075560 0.238626 -0.317 0.7515
EthnicityKorean -0.414204 0.173855 -2.382 0.0172 *
EthnicityOther -0.318398 0.268028 -1.188 0.2349
EthnicityVietnamese -0.331430 0.187937 -1.764 0.0778 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2659.6 on 1960 degrees of freedom
Residual deviance: 2201.0 on 1947 degrees of freedom
AIC: 2229
Number of Fisher Scoring iterations: 4
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
# imbal <- ROSE::ROSE(Folkmedicine~.,
# data=rfdata,
# seed=3)$data
# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Folkmedicine~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 9.2 secs
VSURF selected:
17 variables at thresholding step (in 4.7 secs)
3 variables at interpretation step (in 2.7 secs)
1 variables at prediction step (in 1.8 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Age"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Age" "Duration.of.Residency" "Ethnicity"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1382383
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Age 0.01716 0.00066 black
2 Duration.of.Residency 0.01315 0.00055 black
3 Ethnicity 0.01311 0.00065 red
4 English.Speaking 0.00899 0.00058 black
5 Religion 0.00684 0.00066 black
6 Income_median 0.00549 0.00036 black
7 English.Difficulties 0.00472 0.00043 black
8 Primary.Language 0.00262 0.00026 black
9 Familiarity.with.Ethnic.Origin 0.00262 0.00032 black
10 Familiarity.with.America 0.00260 0.00035 black
11 Full.Time.Employment 0.00235 0.00042 black
12 Discrimination 0.00127 0.00034 black
13 Dental.Insurance 0.00115 0.00023 black
14 Health.Insurance 0.00088 0.00020 black
15 Identify.Ethnically 0.00070 0.00037 black
16 US.Born 0.00067 0.00014 black
17 Gender 0.00055 0.00026 black
18 Belonging 0.00013 0.00033 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp])
mod_form <- reformulate(pred_vars, response="Folkmedicine")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Folkmedicine ~ Age + Duration.of.Residency
Model 2: Folkmedicine ~ Age + Duration.of.Residency + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1944 1507.7
2 1939 1450.9 5 56.75 5.693e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1511.497 1530.377
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.612602 0.219844 -11.884 < 2e-16 ***
Age 0.022611 0.004636 4.877 1.08e-06 ***
Duration.of.Residency 0.007417 0.005960 1.244 0.213371
EthnicityAsian Indian -0.809613 0.218076 -3.713 0.000205 ***
EthnicityFilipino -0.742468 0.272122 -2.728 0.006364 **
EthnicityKorean 0.234543 0.173006 1.356 0.175197
EthnicityOther -0.735426 0.357019 -2.060 0.039408 *
EthnicityVietnamese -1.084595 0.232545 -4.664 3.10e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1552.9 on 1946 degrees of freedom
Residual deviance: 1450.9 on 1939 degrees of freedom
AIC: 1466.9
Number of Fisher Scoring iterations: 5